In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime
import numpy as np
import pandas as pd
import joblib
import datetime
import os
import numpy as np
import time
import multiprocessing as mp
import re
In [2]:
import inspect, os
import sys

# Make the parent package importable when running the notebook from its
# subdirectory: resolve this file's directory and prepend its parent to
# sys.path.  (The original reached sys through the undocumented `os.sys`
# attribute; importing sys explicitly is the supported way.)
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from data_generation.diff_utils import clean_and_filter
In [3]:
from baselines import plurality, average
from serialization import load_pipeline
In [4]:
# Load the cleaned human annotations, keyed by revision id.  drop=False keeps
# rev_id available as a regular column as well as the index, matching the
# downstream code that selects it by name.
all_annotations = pd.read_csv(
    '../../data/annotations/clean/annotations.tsv', sep='\t'
).set_index('rev_id', drop=False)
In [10]:
model_name = 'linear_char_ed_train'
model_type = 'linear_char_ed'
tasks = ['aggression', 'attack', 'recipient']

# One scoring pipeline and one score calibrator per task, both loaded from the
# per-task model directory on disk.
model_dict = {}
calibrator_dict = {}
for task in tasks:
    task_dir = '../../models/%s/%s' % (task, model_type)
    model_dict[task] = load_pipeline(task_dir, model_name)
    calibrator_dict[task] = joblib.load(os.path.join(task_dir, 'calibrator'))
In [12]:
def apply_models(df, models=None, calibrators=None):
    """Score each row's 'clean_diff' text with every task model, in place.

    For each task adds two columns to ``df``:
    ``pred_<task>_score_uncalibrated`` (positive-class probability from the
    model) and ``pred_<task>_score_calibrated`` (that score passed through the
    task's calibrator).

    Parameters
    ----------
    df : DataFrame with a 'clean_diff' column of text.
    models : optional mapping task -> estimator with ``predict_proba``;
        defaults to the module-level ``model_dict``.
    calibrators : optional mapping task -> object with ``transform``;
        defaults to the module-level ``calibrator_dict``.

    Returns the same (mutated) DataFrame.
    """
    if models is None:
        models = model_dict
    if calibrators is None:
        calibrators = calibrator_dict
    diffs = df['clean_diff']
    for task, model in models.items():
        # Column 1 of predict_proba is the positive ("is attack" etc.) class.
        scores = model.predict_proba(diffs)[:, 1]
        df['pred_%s_score_uncalibrated' % task] = scores
        df['pred_%s_score_calibrated' % task] = calibrators[task].transform(scores)
    return df
In [7]:
cols = ['rev_id', 'ns', 'sample', 'src', 'clean_diff', 'diff', 'page_id', 'page_title', 'rev_comment', 'rev_timestamp', 'user_id', 'user_text']

# For each namespace, collapse the per-worker annotations of the random sample
# into one row per revision, attaching the plurality label and mean score for
# each task, then write the result to disk.
for ns in ['user', 'article']:
    d_annotations = all_annotations.query("sample=='random' and ns=='%s'" % ns)
    d_annotated = d_annotations\
        .drop_duplicates(subset=['rev_id'])[cols]\
        .assign(
            recipient=plurality(d_annotations['recipient'].dropna()),
            recipient_score=average(d_annotations['recipient'].dropna()),
            aggression=plurality(d_annotations['aggression'].dropna()),
            aggression_score=average(d_annotations['aggression'].dropna()),
            attack=plurality(d_annotations['attack'].dropna()),
            attack_score=average(d_annotations['attack'].dropna())
        )
    d_annotated.to_csv('../../data/samples/%s/clean/d_annotated.tsv' % ns, sep='\t', index=False)

# Free the full annotation table; only the aggregated files are needed below.
del all_annotations
In [8]:
def pred_helper(df):
    """Prepare one chunk and score it with all task models.

    Parses 'rev_timestamp' into datetimes, forces 'clean_diff' to str, then
    runs apply_models.  Returns None for an empty chunk.
    """
    if len(df) == 0:
        return None
    prepped = df.assign(
        rev_timestamp=lambda d: pd.to_datetime(d.rev_timestamp),
        clean_diff=lambda d: d['clean_diff'].astype(str),
    )
    return apply_models(prepped)
def prep_in_parallel(path, k = 8):
    """Read a TSV of comments and score it in roughly 10k-row chunks.

    Parameters
    ----------
    path : TSV file readable by pandas, with the columns pred_helper expects
        (at least 'clean_diff' and 'rev_timestamp').
    k : intended worker count for the multiprocessing path, which is currently
        disabled (see NOTE below); kept for interface compatibility.

    Returns the scored DataFrame (chunks re-concatenated; row order follows
    the random chunk assignment, not the input file).
    """
    df = pd.read_csv(path, sep = '\t', encoding = 'utf-8')
    m = df.shape[0]
    # Guard against small files: int(m / 10000.0) is 0 for m < 10000, and
    # np.random.randint(0, high=0) raises ValueError.  Use at least one group.
    n_groups = max(1, int(m / 10000.0))
    df['key'] = np.random.randint(0, high=n_groups, size=m)
    chunks = [group for _, group in df.groupby('key')]
    # NOTE: a multiprocessing pool (mp.Pool(k)) was tried and disabled here;
    # chunks are now scored serially.
    scored = [pred_helper(chunk) for chunk in chunks]
    return pd.concat(scored)
In [13]:
base = '../../data/comments/'
nss = ['user', 'article']
samples = ['d_annotated.tsv', 'talk_diff_no_admin_sample.tsv', 'talk_diff_no_admin_2015.tsv', 'all_blocked_user.tsv']

# Columns written for every sample: identifiers, text, and the six model
# scores (calibrated and uncalibrated for each of the three tasks).
base_cols = [
    'rev_id',
    'clean_diff',
    'rev_timestamp',
    'pred_aggression_score_uncalibrated',
    'pred_recipient_score_uncalibrated',
    'pred_attack_score_uncalibrated',
    'pred_aggression_score_calibrated',
    'pred_recipient_score_calibrated',
    'pred_attack_score_calibrated',
    'page_title',
    'user_text',
    'user_id',
]
# Human-label columns, present only in the annotated sample.
extra_cols = ['recipient', 'recipient_score', 'aggression', 'aggression_score', 'attack', 'attack_score']

# Score every (namespace, sample) file from clean/ and write it to scored/.
for ns in nss:
    for s in samples:
        inf = os.path.join(base, ns, 'clean', s)
        print(inf)
        outf = os.path.join(base, ns, 'scored', s)
        cols = base_cols + extra_cols if s == 'd_annotated.tsv' else base_cols
        prep_in_parallel(inf, k = 4)[cols].to_csv(outf, sep = '\t', index = False)
In [ ]: